Census-seq

Author

Maddie Rea

Published

July 25, 2024

Maddie’s Presentation

Welcome to my presentation! Today I’m going to showcase what I’ve been doing over the summer. This is the culmination of most of my work, I welcome and encourage feedback before my poster presentation!

Code
library(ggplot2)
library(MASS) 
library(reshape2) 
library(reshape) 
library(patchwork)
library(ggpubr)
library(plotly)
library(dplyr)
library(readr)
library(crayon)
library(pheatmap)
Code
compare_vcfs = function(quilt_input,geno_input,census_input,x_order){
  census_input = census_input[order(census_input$V1),]
  quilt_input$READS = census_input$V2
  meltedbar = subset(quilt_input, select = c(DONOR,READS,COUNT))
  meltedbar = melt(meltedbar,id="DONOR")
  colnames(meltedbar) <- c('Donor','Filtered','Reads')
  p1 <- ggplot(data=meltedbar, aes(x=Donor,y=Reads, fill=Filtered)) +
  geom_bar(position="identity",stat="identity") +
  scale_fill_manual(values=c("#D41159", "#69C561")) +
  scale_x_discrete(limits = x_order) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + 
  theme(
    panel.background = element_rect(fill='transparent'), #transparent panel bg
    plot.background = element_rect(fill='transparent', color=NA), #transparent plot bg
    panel.grid.major = element_blank(), #remove major gridlines
    panel.grid.minor = element_blank(), #remove minor gridlines
    legend.background = element_rect(fill='transparent'), #transparent legend bg
    legend.box.background = element_rect(fill='transparent') #transparent legend panel
  )
  
  quilt_input$G_REP = geno_input$REPRESENTATION
  quilt_input$Genoprobs = quilt_input$G_REP - quilt_input$KNOWN
  quilt_input$Quilt = quilt_input$REPRESENTATION - quilt_input$KNOWN
  meltedline = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
  meltedline = melt(meltedline,id="DONOR")
  colnames(meltedline) <- c('Donor','VCF','Differences')
  p2 <- ggplot(data = meltedline) +
  geom_hline(yintercept=0,size=0.5)+
  geom_line(aes(x = Donor, y = Differences, colour = VCF, group = VCF)) +
  geom_point(aes(x = Donor, y = Differences, colour = VCF, group = VCF))+
  scale_x_discrete(limits = x_order) +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  scale_color_manual(values=c('#063970', '#A69943')) +
  theme(
    panel.background = element_rect(fill='transparent'), #transparent panel bg
    plot.background = element_rect(fill='transparent', color=NA), #transparent plot bg
    panel.grid.major = element_blank(), #remove major gridlines
    panel.grid.minor = element_blank(), #remove minor gridlines
    legend.background = element_rect(fill='transparent'), #transparent legend bg
    legend.box.background = element_rect(fill='transparent') #transparent legend panel
  )
  
  quilt_input$Genoprobs = ((quilt_input$G_REP - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
  quilt_input$Quilt = ((quilt_input$REPRESENTATION - quilt_input$KNOWN) / quilt_input$KNOWN) * 100
  meltedline_error = subset(quilt_input, select = c(DONOR,Quilt,Genoprobs))
  meltedline_error = melt(meltedline_error,id="DONOR")
  colnames(meltedline_error) <- c('Donor','VCF','PercentError')
  p3 <- ggplot(data = meltedline_error) +
  geom_hline(yintercept=0,size=0.5)+
  geom_line(aes(x = Donor, y = PercentError, colour = VCF, group = VCF)) +
  geom_point(aes(x = Donor, y = PercentError, colour = VCF, group = VCF))+
  scale_x_discrete(limits = x_order) +
  scale_color_manual(values=c('#063970', '#A69943')) +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  theme(
    panel.background = element_rect(fill='transparent'), #transparent panel bg
    plot.background = element_rect(fill='transparent', color=NA), #transparent plot bg
    panel.grid.major = element_blank(), #remove major gridlines
    panel.grid.minor = element_blank(), #remove minor gridlines
    legend.background = element_rect(fill='transparent'), #transparent legend bg
    legend.box.background = element_rect(fill='transparent') #transparent legend panel
  )

stacked_plots <- p2 + p3 + p1 + plot_layout(ncol = 1)
stacked_plots
}
Code
log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")

hold = log_census[order(log_census$V2),]
hold = hold$V1

Equal Proportion VCF Comparison

Code
equal_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_quilt.census.txt",header=TRUE,sep="\t",skip=2)
equal_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_geno.census.txt",header=TRUE,sep="\t",skip=2)
equal_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/equal_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(equal_quilt,equal_geno,equal_census,hold)
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

The bottom bargraph represents the proportion of reads for each donor within the “pool” of samples. The total height of the bar represents the reads per sample inputted into Census-seq, but the red on top represents reads which did not pass the quality control. The line graph on top represents the difference between the true proprotion and the estimated proportions. Dots over the line are overestimations and dots under the line are underestimations.

Log Proportion VCF Comparison

Code
log_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_quilt.census.txt",header=TRUE,sep="\t",skip=2)
log_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_geno.census.txt",header=TRUE,sep="\t",skip=2)
log_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/log_census_report.tsv",header=FALSE,sep="\t")

compare_vcfs(log_quilt,log_geno,log_census,hold)

Random Proportion VCF Comparison

Code
random_quilt = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_quilt.census.txt",header=TRUE,sep="\t",skip=2)
random_geno = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_geno.census.txt",header=TRUE,sep="\t",skip=2)
random_census = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/sixteen_vcfs_sample/random_census_report.tsv",header=FALSE,sep="\t")
compare_vcfs(random_quilt,random_geno,random_census,hold)

Code
# library(reticulate)
# conda_create(envname = "myenv")
# conda_install( envname = "myenv", packages = c("seaborn","pandas","matplotlib","plotly"))
# use_condaenv(condaenv = "myenv",required = TRUE  )
Code
# import os
# import numpy as np
# import seaborn as sb
# import pandas as pd
# import matplotlib.pyplot as plt
# import plotly.express as px
# 
# sample_amounts = []
# read_numbers = []
# percent_errors = []
# proportion_types = []
# 
# def calculate_pe(df):
#   df['PERCENT_ERROR'] = abs(df['REPRESENTATION'] - df['KNOWN']) / df['KNOWN'] * 100
#   av_percent_error = df['PERCENT_ERROR'].mean()
#   return av_percent_error
# 
# for root,dirs,files, in os.walk('/flashscratch/munger-rea/bwa-mem/'):
#   for file in files:
#     if file == "my.census.txt":
#       file_path = os.path.join(root,file)
#       print(file_path)
#         
#       path_parts = root.split('/')
#       sample_amount = path_parts[-4]
#       proportion_type = path_parts[-3]
#       read_number = path_parts[-2]
#       
#       df = pd.read_csv(os.path.join(root, file), sep="\t",skiprows=2)
#       percent_error = calculate_pe(df)
#       sample_amounts.append(sample_amount)
#       read_numbers.append(read_number)
#       percent_errors.append(percent_error)
#       proportion_types.append(proportion_type)
#       
# percent_errors = [float(i) for i in percent_errors]
# 
# reads_to_numbers = {
#         'h_thousand': 100000,
#         'o_million': 1000000,
#         't_million': 10000000,
#         'f_million': 50000000,
#         'h_million': 100000000
# }
# 
# read_numbers = [reads_to_numbers[i] for i in read_numbers]
# 
# samples_to_numbers = {
#         's4': 4,
#         's8': 8,
#         's16': 16,
#         's32': 32,
#         's48': 48
# }
# 
# sample_amounts = [samples_to_numbers[i] for i in sample_amounts]
# 
# df = pd.DataFrame({
#     'Reads': read_numbers,
#     'Samples': sample_amounts,
#     'Percent_Errors': percent_errors,
#     'Proportion_Type': proportion_types
# })
# 
# fig = px.scatter_3d(df, x='Samples', y='Reads', z='Percent_Errors',
#               color='Proportion_Type')
# fig.show()

3D Percent Error Graph

Code
sample_amounts <- c()
read_numbers <- c()
percent_errors <- c()
proportion_types <- c()

calculate_pe <- function(df) {
  df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
  av_percent_error <- mean(df$PERCENT_ERROR)
  return(av_percent_error)
}

files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)

for (file_path in files) {
  
  path_parts <- strsplit(file_path, "/")[[1]]
  sample_amount <- path_parts[length(path_parts) - 4]
  proportion_type <- path_parts[length(path_parts) - 3]
  read_number <- path_parts[length(path_parts) - 2]
  
  df <- read_tsv(file_path, skip = 2)
  percent_error <- calculate_pe(df)
  sample_amounts <- c(sample_amounts, sample_amount)
  read_numbers <- c(read_numbers, read_number)
  percent_errors <- c(percent_errors, percent_error)
  proportion_types <- c(proportion_types, proportion_type)
}

percent_errors <- as.numeric(percent_errors)

reads_to_numbers <- list(
  'h_thousand' = 100000,
  'o_million' = 1000000,
  't_million' = 10000000,
  'f_million' = 50000000,
  'h_million' = 100000000
)

read_numbers <- sapply(read_numbers, function(x) reads_to_numbers[[x]])

samples_to_numbers <- list(
  's4' = 4,
  's8' = 8,
  's16' = 16,
  's32' = 32,
  's48' = 48
)

sample_amounts <- sapply(sample_amounts, function(x) samples_to_numbers[[x]])

df <- data.frame(
  'Reads' = read_numbers,
  'Samples' = sample_amounts,
  'Percent_Error' = percent_errors,
  'Proportion_Type' = proportion_types
)

fig <- plot_ly(df, x = ~Samples, y = ~Reads, z = ~Percent_Error, color = ~Proportion_Type, colors = c('#648FFF','#DC267F','#FFB000'))
fig <- fig %>% add_markers(marker = list(opacity = 0.5))
fig <- fig %>% layout(scene = list(xaxis = list(title = 'Number of Samples'),
                     yaxis = list(title = 'Number of Reads'),
                     zaxis = list(title = 'Percent Error')))

fig

Random Proportion versus Percent Error Animated

Code
library(dplyr)
library(readr)
library(plotly)

read_numbers <- c()

calculate_pe <- function(df) {
  df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
  av_percent_error <- mean(df$PERCENT_ERROR)
  return(av_percent_error)
}

files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/random/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)

results <- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())

for (file_path in files) {
  
  path_parts <- strsplit(file_path, "/")[[1]]
  read_number <- path_parts[length(path_parts) - 2]
  run_number <- path_parts[length(path_parts) - 1]
  
  df <- read_tsv(file_path, skip = 2)
  
  df <- df %>%
    mutate(DONOR = paste0(DONOR, "_", run_number),
           Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
           Reads = read_number)
  
  results <- rbind(results, df)
  
}


reads_to_numbers <- list(
  'h_thousand' = 100000,
  'o_million' = 1000000,
  't_million' = 10000000,
  'f_million' = 50000000,
  'h_million' = 100000000
)

results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)

results$DONOR <- as.factor(results$DONOR)

fig <- results %>%
  plot_ly(
    x = ~KNOWN,
    y = ~Percent_Error,
    frame = ~Reads,
    text = ~DONOR,
    hoverinfo = "text",
    type = 'scatter',
    mode = 'markers'
  )

fig

Logistic Proportion versus Percent Error Animated

Code
library(dplyr)
library(readr)
library(plotly)
library(gapminder)

read_numbers <- c()

calculate_pe <- function(df) {
  df$PERCENT_ERROR <- abs(df$REPRESENTATION - df$KNOWN) / df$KNOWN * 100
  av_percent_error <- mean(df$PERCENT_ERROR)
  return(av_percent_error)
}

files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/genoprobs_runs/s48/log/", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)

results <- data.frame(Proportion = numeric(), Percent_Error = numeric(), Reads = character())

for (file_path in files) {
  
  path_parts <- strsplit(file_path, "/")[[1]]
  read_number <- path_parts[length(path_parts) - 2]
  run_number <- path_parts[length(path_parts) - 1]
  
  df <- read_tsv(file_path, skip = 2)
  
  df <- df %>%
    mutate(DONOR = paste0(DONOR, "_", run_number),
           Percent_Error = abs(REPRESENTATION - KNOWN) / KNOWN * 100,
           Reads = read_number)
  
  results <- rbind(results, df)
  
}


reads_to_numbers <- list(
  'h_thousand' = 100000,
  'o_million' = 1000000,
  't_million' = 10000000,
  'f_million' = 50000000,
  'h_million' = 100000000
)

results$Reads <- sapply(results$Reads, function(x) reads_to_numbers[[x]])
results$Reads <- as.integer(results$Reads)

results$DONOR <- as.factor(results$DONOR)

fig <- results %>%
  plot_ly(
    x = ~KNOWN,
    y = ~Percent_Error,
    frame = ~Reads,
    text = ~DONOR,
    hoverinfo = "text",
    type = 'scatter',
    mode = 'markers'
  )

fig

Different Aligners

Code
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)

bwa$STAR = STAR$REPRESENTATION
bwa$minimap = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie2 = bowtie$REPRESENTATION

bwa$STAR = (abs(bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$minimap = (abs(bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = (abs(bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie2 = (abs(bwa$bowtie2 - bwa$KNOWN) / bwa$KNOWN) * 100

meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie2,minimap,STAR))
meltedviolin = melt(meltedviolin,id="DONOR")
colnames(meltedviolin) <- c('Donor','Aligner','PercentError')

ggplot(meltedviolin, aes(x=Aligner,y=PercentError,fill=Aligner)) + 
  geom_violin() +
  scale_fill_brewer(palette="Dark2") + 
  geom_dotplot(binaxis='y',stackdir = 'center',dotsize=1,color="black") +
  labs(title="Aligner Type versus Percent Error",x="Aligner", y = "Percent Error") +
  theme_minimal() 

Aligner Heatmap

Code
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)

bwa$STAR = STAR$REPRESENTATION
bwa$mini = mini$REPRESENTATION
bwa$bwa = bwa$REPRESENTATION
bwa$bowtie = bowtie$REPRESENTATION

bwa$STAR = ((bwa$STAR - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$mini = ((bwa$mini - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bwa = ((bwa$bwa - bwa$KNOWN) / bwa$KNOWN) * 100
bwa$bowtie = ((bwa$bowtie - bwa$KNOWN) / bwa$KNOWN) * 100

bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)

meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie,mini,STAR))

df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL

pheatmap(df)

Code
#meltedviolin = melt(meltedviolin,id="DONOR")
#colnames(meltedviolin) <- c('Donor','Aligner','PercentError')
# heatmap_plot = ggplot(data=meltedviolin, aes(x=Donor,y=Aligner)) +
#   geom_tile(aes(fill=PercentError)) +
#   scale_fill_gradient2() +
#   theme(axis.text.y = element_text(size = 6)) + 
#   theme_minimal()
Code
bowtie = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bowtie.census.txt",sep="\t",skip=2,header=TRUE)
mini = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/mini.census.txt",sep="\t",skip=2,header=TRUE)
bwa = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/bwa.census.txt",sep="\t",skip=2,header=TRUE)
STAR = read.csv("/projects/munger-lab/projects/census_seq_SSP2024/diff_aligners_test/STAR.census.txt",sep="\t",skip=2,header=TRUE)

bwa$STAR = STAR$REPRESENTATION - STAR$KNOWN
bwa$mini = mini$REPRESENTATION - mini$KNOWN
bwa$bwa = bwa$REPRESENTATION - bwa$KNOWN
bwa$bowtie = bowtie$REPRESENTATION - bowtie$KNOWN

bwa$DONOR = sub("plexWell-", "", bwa$DONOR)
bwa$DONOR = sub("_GT23-023.._.*", "", bwa$DONOR)

meltedviolin = subset(bwa, select = c(DONOR,bwa,bowtie,mini,STAR))

df = as.data.frame(meltedviolin)
rownames(df) = df$DONOR
df$DONOR = NULL

pheatmap(df)

One Million Reads In Silico versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 1000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]


p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()


final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100

final_df = final_df %>%
  group_by(X2) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = X2, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
  theme_minimal()

p1 + p2

One Million Reads Passed Reads versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_one", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 1000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]


p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()

final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100

final_df = final_df %>%
  group_by(KNOWN) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
  theme_minimal()

p1 + p2

Ten Million Reads In Silico versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 10000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]


p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()


final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100

final_df = final_df %>%
  group_by(X2) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = X2, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
  theme_minimal()

p1 + p2

Ten Million Reads Passed Reads versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_ten", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 10000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]


p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()

final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100

final_df = final_df %>%
  group_by(KNOWN) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
  theme_minimal()

p1 + p2

One Hundred Million Reads In Silico versus Inferred

Code
files <- list.files(path = "/projects/munger-lab/projects/census_seq_SSP2024/replication_runs/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 100000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]


p1 <- ggplot(final_df, aes(x = X2, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()


final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$X2) / final_df$X2 * 100

final_df = final_df %>%
  group_by(X2) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = X2, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (in silico mixture)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
  theme_minimal()

p1 + p2

One Hundred Million Reads Passed Reads versus Inferred

Code
files <- list.files(path = "/flashscratch/munger-rea/tiny_prop_hundred", pattern = "my.census.txt", recursive = TRUE, full.names = TRUE)
dirs <- dirname(files)

merged_dfs <- list()

for (dir in dirs) {
  # Construct the file paths
  file_path_txt <- file.path(dir, "my.census.txt")
  file_path_tsv <- file.path(dir, "census_report.tsv")
  df_txt <- read_tsv(file_path_txt, skip = 2)
  df_tsv <- read_tsv(file_path_tsv, col_names=FALSE)
  merged_df <- merge(df_txt, df_tsv, by.x = "DONOR", by.y = "X1", all.x = TRUE)
  merged_df$X2 = merged_df$X2 / 100000000
  
  merged_dfs[[length(merged_dfs) + 1]] <- merged_df
}

final_df <- do.call(rbind, merged_dfs)
final_df <- final_df[-which(final_df$X2 > 0.1),]


p1 <- ggplot(final_df, aes(x = KNOWN, y = REPRESENTATION)) + 
  geom_point(shape=1) + 
  geom_abline(slope=1, intercept=0,colour='#E41A1C') +
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.016)) +
  scale_y_continuous(name="Donor Fraction of village (inferred by Census-seq)", limits=c(0, 0.016)) +
  theme_minimal()

final_df$Percent_Error = abs(final_df$REPRESENTATION - final_df$KNOWN) / final_df$KNOWN * 100

final_df = final_df %>%
  group_by(KNOWN) %>%
  mutate(Median_Percent_Error = median(Percent_Error))

p2 <- ggplot(final_df, aes(x = KNOWN, y = Median_Percent_Error)) + 
  geom_point() + 
  geom_line() + 
  scale_x_continuous(name="Donor Fraction of village (passed reads)", limits=c(0, 0.0105)) +
  scale_y_continuous(name="Median Percent Error", limits=c(0, 100)) +
  theme_minimal()

p1 + p2

Code
library(plotly)

dens <- with(diamonds, tapply(price, INDEX = cut, density))
data <- data.frame(
  x = unlist(lapply(dens, "[[", "x")),
  y = unlist(lapply(dens, "[[", "y")),
  cut = rep(names(dens), each = length(dens[[1]]$x)))

fig <- plot_ly(data, x = ~x, y = ~y, z = ~cut, type = 'scatter3d', mode = 'lines', color = ~cut)

fig